{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import torch\n",
    "import numpy as np\n",
    "from torchvision import models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "mode_list = \"default reduce-overhead max-autotune\".split()\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 实验一：sin函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mode: default, 编译耗时:0.00，编译前运行耗时:1.41, 编译后运行耗时:3.26，速度提升比例:-130.68%\n",
      "mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:1.36, 编译后运行耗时:5.25，速度提升比例:-284.67%\n",
      "mode: max-autotune, 编译耗时:0.00，编译前运行耗时:1.34, 编译后运行耗时:5.30，速度提升比例:-296.58%\n"
     ]
    }
   ],
   "source": [
    "def sin_func(x):\n",
    "    return torch.sin(x) + torch.cos(x)\n",
    "\n",
    "run_times = 100000\n",
    "# Several warmup calls keep one-off setup work out of the timed loops.\n",
    "warmup_times = 3\n",
    "i_data = torch.tensor(1).to(device)\n",
    "# torch.cuda.synchronize() fails on a CPU-only machine, so only sync when `device` is CUDA.\n",
    "sync_device = torch.cuda.synchronize if device.type == \"cuda\" else (lambda: None)\n",
    "\n",
    "for mode in mode_list:\n",
    "    # Clear compilation caches so every mode compiles sin_func from a clean state.\n",
    "    torch.compiler.reset()\n",
    "\n",
    "    # torch.compile() only wraps the function; the actual compilation runs on the\n",
    "    # first call, so that call belongs inside the measured compile window.\n",
    "    sync_device()\n",
    "    time_0 = time.perf_counter()\n",
    "    module_compiled = torch.compile(sin_func, mode=mode)\n",
    "    module_compiled(i_data)\n",
    "    sync_device()\n",
    "    time_1 = time.perf_counter()\n",
    "\n",
    "    # warmup\n",
    "    for i in range(warmup_times):\n",
    "        sin_func(i_data)\n",
    "        module_compiled(i_data)\n",
    "\n",
    "    # Timed eager runs.\n",
    "    sync_device()\n",
    "    time_2 = time.perf_counter()\n",
    "    for i in range(run_times):\n",
    "        sin_func(i_data)\n",
    "\n",
    "    # Timed compiled runs.\n",
    "    sync_device()\n",
    "    time_3 = time.perf_counter()\n",
    "    for i in range(run_times):\n",
    "        module_compiled(i_data)\n",
    "    sync_device()\n",
    "    time_4 = time.perf_counter()\n",
    "\n",
    "    compile_time = time_1 - time_0\n",
    "    pre_time = time_3 - time_2\n",
    "    post_time = time_4 - time_3\n",
    "    # Fraction of the eager time saved by the compiled version (negative = slower).\n",
    "    speedup_ratio = (pre_time - post_time)/pre_time\n",
    "\n",
    "    print(f\"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 实验二：resnet18"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ResNet-18 built without requesting pretrained weights, moved to the benchmark\n",
    "# device and switched to eval mode.\n",
    "resnet18 = models.resnet18()\n",
    "resnet18 = resnet18.to(device).eval()\n",
    "# One random batch of shape (16, 3, 224, 224) used as the benchmark input.\n",
    "fake_img = torch.randn(16, 3, 224, 224).to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mode: default, 编译耗时:0.00，编译前运行耗时:0.90, 编译后运行耗时:0.69，速度提升比例:24.05%\n",
      "mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:1.03, 编译后运行耗时:1.05，速度提升比例:-1.87%\n",
      "mode: max-autotune, 编译耗时:0.00，编译前运行耗时:1.03, 编译后运行耗时:0.98，速度提升比例:5.29%\n"
     ]
    }
   ],
   "source": [
    "run_times = 100\n",
    "# Several warmup calls keep one-off setup work out of the timed loops.\n",
    "warmup_times = 3\n",
    "# torch.cuda.synchronize() fails on a CPU-only machine, so only sync when `device` is CUDA.\n",
    "sync_device = torch.cuda.synchronize if device.type == \"cuda\" else (lambda: None)\n",
    "\n",
    "with torch.no_grad():\n",
    "    for mode in mode_list:\n",
    "        # Clear compilation caches so every mode compiles resnet18 from a clean state.\n",
    "        torch.compiler.reset()\n",
    "\n",
    "        # torch.compile() only wraps the module; the actual compilation runs on the\n",
    "        # first call, so that call belongs inside the measured compile window.\n",
    "        sync_device()\n",
    "        time_0 = time.perf_counter()\n",
    "        module_compiled = torch.compile(resnet18, mode=mode)\n",
    "        module_compiled(fake_img)\n",
    "        sync_device()\n",
    "        time_1 = time.perf_counter()\n",
    "\n",
    "        # warmup is critical!\n",
    "        for i in range(warmup_times):\n",
    "            resnet18(fake_img)\n",
    "            module_compiled(fake_img)\n",
    "\n",
    "        # Timed eager runs.\n",
    "        sync_device()\n",
    "        time_2 = time.perf_counter()\n",
    "        for i in range(run_times):\n",
    "            resnet18(fake_img)\n",
    "\n",
    "        # Timed compiled runs.\n",
    "        sync_device()\n",
    "        time_3 = time.perf_counter()\n",
    "        for i in range(run_times):\n",
    "            module_compiled(fake_img)\n",
    "\n",
    "        sync_device()\n",
    "        time_4 = time.perf_counter()\n",
    "\n",
    "        compile_time = time_1 - time_0\n",
    "        pre_time = time_3 - time_2\n",
    "        post_time = time_4 - time_3\n",
    "        # Fraction of the eager time saved by the compiled version (negative = slower).\n",
    "        speedup_ratio = (pre_time - post_time)/pre_time\n",
    "\n",
    "        print(f\"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 实验三：BERT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertModel, BertTokenizer\n",
    "import time\n",
    "\n",
    "# Load the tokenizer and the model from the same 'bert-base-uncased' checkpoint name.\n",
    "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
    "bert = BertModel.from_pretrained('bert-base-uncased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mode: default, 编译耗时:0.00，编译前运行耗时:0.55, 编译后运行耗时:0.22，速度提升比例:60.65%\n",
      "mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:0.67, 编译后运行耗时:0.52，速度提升比例:22.55%\n",
      "mode: max-autotune, 编译耗时:0.00，编译前运行耗时:0.48, 编译后运行耗时:0.56，速度提升比例:-15.52%\n"
     ]
    }
   ],
   "source": [
    "# Prepare one tokenized input and move the tensors and the model to the benchmark device.\n",
    "input_text = \"Here is some text to encode\"\n",
    "inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)\n",
    "inputs = {k: v.to(device) for k, v in inputs.items()}\n",
    "bert.to(device)\n",
    "bert.eval()\n",
    "\n",
    "run_times = 100\n",
    "# Several warmup calls keep one-off setup work out of the timed loops.\n",
    "warmup_times = 3\n",
    "# torch.cuda.synchronize() fails on a CPU-only machine, so only sync when `device` is CUDA.\n",
    "sync_device = torch.cuda.synchronize if device.type == \"cuda\" else (lambda: None)\n",
    "\n",
    "with torch.no_grad():\n",
    "    for mode in mode_list:\n",
    "        # Clear compilation caches so every mode compiles bert from a clean state.\n",
    "        torch.compiler.reset()\n",
    "\n",
    "        # Compile: torch.compile() only wraps the module; the actual compilation runs\n",
    "        # on the first call, so that call belongs inside the measured compile window.\n",
    "        sync_device()\n",
    "        time_0 = time.perf_counter()\n",
    "        bert_compiled = torch.compile(bert, mode=mode)\n",
    "        bert_compiled(**inputs)\n",
    "        sync_device()\n",
    "        time_1 = time.perf_counter()\n",
    "\n",
    "        # warmup is critical!\n",
    "        for _ in range(warmup_times):\n",
    "            bert(**inputs)\n",
    "            bert_compiled(**inputs)\n",
    "\n",
    "        # Timed eager runs.\n",
    "        sync_device()\n",
    "        time_2 = time.perf_counter()\n",
    "        for _ in range(run_times):\n",
    "            _ = bert(**inputs)\n",
    "\n",
    "        # Timed compiled runs.\n",
    "        sync_device()\n",
    "        time_3 = time.perf_counter()\n",
    "        for _ in range(run_times):\n",
    "            _ = bert_compiled(**inputs)\n",
    "\n",
    "        sync_device()\n",
    "        time_4 = time.perf_counter()\n",
    "\n",
    "        compile_time = time_1 - time_0\n",
    "        pre_time = time_3 - time_2\n",
    "        post_time = time_4 - time_3\n",
    "        # Fraction of the eager time saved by the compiled version (negative = slower).\n",
    "        speedup_ratio = (pre_time - post_time)/pre_time\n",
    "\n",
    "        print(f\"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 实验四：numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mode: default, 编译耗时:0.00，编译前运行耗时:87.93, 编译后运行耗时:6.19，速度提升比例:92.96%\n",
      "mode: reduce-overhead, 编译耗时:0.00，编译前运行耗时:87.21, 编译后运行耗时:6.22，速度提升比例:92.86%\n",
      "mode: max-autotune, 编译耗时:0.00，编译前运行耗时:86.77, 编译后运行耗时:5.98，速度提升比例:93.11%\n"
     ]
    }
   ],
   "source": [
    "run_times = 100\n",
    "\n",
    "def numpy_fn2(X: np.ndarray, Y: np.ndarray) -> np.ndarray:\n",
    "    return np.sum(X[:, :, None] * Y[:, None, :], axis=(-2, -1))\n",
    "\n",
    "def numpy_fn(X: np.ndarray, Y: np.ndarray) -> np.ndarray:\n",
    "    # Step 1: Normalize the input arrays to have zero mean and unit variance\n",
    "    X_mean, X_std = X.mean(axis=0), X.std(axis=0)\n",
    "    Y_mean, Y_std = Y.mean(axis=0), Y.std(axis=0)\n",
    "    \n",
    "    # Avoid division by zero in case of zero standard deviation\n",
    "    X_std[X_std == 0] = 1\n",
    "    Y_std[Y_std == 0] = 1\n",
    "    \n",
    "    X_normalized = (X - X_mean) / X_std\n",
    "    Y_normalized = (Y - Y_mean) / Y_std\n",
    "    \n",
    "    # Step 2: Perform the tensor product followed by sum over last two dimensions\n",
    "    intermediate_result = np.sum(X_normalized[:, :, None] * Y_normalized[:, None, :], axis=(-2, -1))\n",
    "    \n",
    "    # Step 3: Apply thresholding to clip values outside of [-1, 1]\n",
    "    intermediate_result = np.clip(intermediate_result, -1, 1)\n",
    "    \n",
    "    # Step 4: Apply exponential function for non-linearity\n",
    "    result = np.exp(intermediate_result)\n",
    "    \n",
    "    # Step 5: Add a small regularization term to avoid overfitting\n",
    "    regularization_term = 0.001 * np.sum(X_normalized ** 2 + Y_normalized ** 2, axis=1)\n",
    "    result += regularization_term\n",
    "    \n",
    "    return result\n",
    "\n",
    "x = np.random.randn(1024, 640)\n",
    "y = np.random.randn(1024, 640)\n",
    "\n",
    "# Several warmup calls keep one-off setup work out of the timed loops.\n",
    "warmup_times = 3\n",
    "# torch.cuda.synchronize() fails on a CPU-only machine, so only sync when `device` is CUDA.\n",
    "sync_device = torch.cuda.synchronize if device.type == \"cuda\" else (lambda: None)\n",
    "\n",
    "for mode in mode_list:\n",
    "    # Clear compilation caches so every mode compiles numpy_fn from a clean state.\n",
    "    torch.compiler.reset()\n",
    "\n",
    "    # torch.compile() only wraps the function; the actual compilation runs on the\n",
    "    # first call, so that call belongs inside the measured compile window.\n",
    "    sync_device()\n",
    "    time_0 = time.perf_counter()\n",
    "    numpy_fn_compiled = torch.compile(numpy_fn, mode=mode)\n",
    "    numpy_fn_compiled(x, y)\n",
    "    sync_device()\n",
    "    time_1 = time.perf_counter()\n",
    "\n",
    "    # warmup is critical!\n",
    "    for i in range(warmup_times):\n",
    "        numpy_fn(x, y)\n",
    "        numpy_fn_compiled(x, y)\n",
    "\n",
    "    # Timed plain NumPy runs.\n",
    "    sync_device()\n",
    "    time_2 = time.perf_counter()\n",
    "    for i in range(run_times):\n",
    "        numpy_fn(x, y)\n",
    "\n",
    "    # Timed compiled runs.\n",
    "    sync_device()\n",
    "    time_3 = time.perf_counter()\n",
    "    for i in range(run_times):\n",
    "        numpy_fn_compiled(x, y)\n",
    "\n",
    "    sync_device()\n",
    "    time_4 = time.perf_counter()\n",
    "\n",
    "    compile_time = time_1 - time_0\n",
    "    pre_time = time_3 - time_2\n",
    "    post_time = time_4 - time_3\n",
    "    # Fraction of the uncompiled time saved by the compiled version (negative = slower).\n",
    "    speedup_ratio = (pre_time - post_time)/pre_time\n",
    "\n",
    "    print(f\"mode: {mode}, 编译耗时:{compile_time:.2f}，编译前运行耗时:{pre_time:.2f}, 编译后运行耗时:{post_time:.2f}，速度提升比例:{speedup_ratio:.2%}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py311",
   "language": "python",
   "name": "py311"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
